import pandas as pd
import numpy as np
# Asset-class sheet: skip the first spreadsheet row and keep only the
# columns whose header does not contain 'Unnamed'.
data_asset_classes = pd.read_excel(
    'Data_Bloom.xlsx',
    sheet_name='AssetClasses',
    skiprows=1,
    usecols=lambda col: 'Unnamed' not in col,
)

# Macro sheet: header taken from row index 4, then the 'Ticker' column and
# the row labelled 0 are removed.
data_macro_bb = (
    pd.read_excel('Data_Bloom.xlsx', sheet_name='Macro_BB', header=4)
    .drop(columns={'Ticker'})
    .drop(0)
)
data_macro_bb
# Import necessary modules and set options
import itertools
import seaborn as sns
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, LarsCV
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings("ignore")
from matplotlib import pyplot as plt
data_asset_classes

# Targets: for every asset class, value[t] - value[t-1], with the first
# observation (which has no predecessor) dropped.
equity, fi, credit, commo = (
    -(levels.shift(1) - levels)[1:]
    for levels in (
        pd.DataFrame(np.array(data_asset_classes[column]))[0]
        for column in ('EQUITIES', 'FIXED INCOME', 'CREDIT', 'COMMO')
    )
)

# Features: first difference of every macro column. X_original keeps the
# undifferenced values as a NumPy array for later plotting.
# NOTE(review): pd.DataFrame(data_macro_bb) may share data with
# data_macro_bb depending on the pandas version, in which case the loop
# below also alters data_macro_bb -- confirm which behaviour is relied on.
X = pd.DataFrame(data_macro_bb)
X_original = np.array(X.copy())
for column in X.columns:
    current = pd.DataFrame(X[column])
    X[column] = current - current.shift(1)
# Remove the row labelled 1 (presumably the first row, which is NaN after
# differencing -- confirm against the sheet layout).
X = X.drop(1)
X
# 2x2 grid with one panel per asset-class series.
fig, axs = plt.subplots(2, 2, figsize=(14, 7), dpi=80, facecolor='w', edgecolor='k')
panels = (
    ((0, 0), equity, (), 'equity'),
    ((0, 1), fi, ('tab:orange',), 'fixed incomes'),
    ((1, 0), credit, ('tab:green',), 'credit'),
    ((1, 1), commo, ('tab:red',), 'commodities'),
)
for position, series, fmt, title in panels:
    axs[position].plot(series, *fmt)
    axs[position].set_title(title)

for ax in axs.flat:
    ax.set(xlabel='x-label', ylabel='y-label')
    # Hide x labels and tick labels for top plots and y ticks for right plots.
    ax.label_outer()
from statsmodels.tsa.stattools import adfuller, kpss

# From here on X is a plain NumPy array.
X = np.array(X)

# ADF test on the first feature column.
result = adfuller(pd.DataFrame(X[:, 0]).values, autolag='AIC')
adf_critical_values = result[4]
print(f'ADF Statistic: {result[0]}')
print(f'p-value: {result[1]}')
for key, value in adf_critical_values.items():
    print('Critical Values:')
    print(f' {key}, {value}')

# KPSS test on the same column, constant-only regression.
result = kpss(pd.DataFrame(X[:, 0]).values, regression='c')
kpss_critical_values = result[3]
print('\nKPSS Statistic: %f' % result[0])
print('p-value: %f' % result[1])
for key, value in kpss_critical_values.items():
    print('Critical Values:')
    print(f' {key}, {value}')
# Plot the undifferenced macro series, one column per panel of a 4x4 grid.
fig, axs = plt.subplots(4, 4, figsize=(14, 7), dpi=80, facecolor='w', edgecolor='k')
column_names = data_macro_bb.columns.to_list()
n_series = min(X_original.shape[1], len(column_names))
for panel, ax in enumerate(axs.flat):
    # axs.flat walks the grid row by row, so `panel` is the row-major
    # position (4 * row + col). Indexing with row + col instead would draw
    # the same column on every anti-diagonal and never reach the later ones.
    if panel < n_series:
        ax.plot(X_original[:, panel])
        ax.set_title(column_names[panel])
    ax.set(xlabel='x-label', ylabel='y-label')
    # Hide x labels and tick labels for top plots and y ticks for right plots.
    ax.label_outer()
# Correlation study: pairwise correlations of the feature columns,
# rounded to one decimal and drawn as an annotated heatmap.
sns.set(rc={'figure.figsize': (11.7, 8.27)})
X_corr = pd.DataFrame(X.astype(float)).corr()
X_corr = X_corr.round(1)
sns.heatmap(data=X_corr, annot=True, cmap="RdBu_r")
# Plot the differenced macro series, one column per panel of a 4x4 grid.
fig, axs = plt.subplots(4, 4, figsize=(14, 7), dpi=80, facecolor='w', edgecolor='k')
column_names = data_macro_bb.columns.to_list()
n_series = min(X.shape[1], len(column_names))
for panel, ax in enumerate(axs.flat):
    # axs.flat walks the grid row by row, so `panel` is the row-major
    # position (4 * row + col). Indexing with row + col instead would draw
    # the same column on every anti-diagonal and never reach the later ones.
    if panel < n_series:
        ax.plot(X[:, panel])
        ax.set_title(column_names[panel])
    ax.set(xlabel='x-label', ylabel='y-label')
    # Hide x labels and tick labels for top plots and y ticks for right plots.
    ax.label_outer()
from sklearn.preprocessing import StandardScaler

# Standardise the features and each target, returning DataFrames.
# (MinMaxScaler(feature_range=(-1, 1)) is the alternative left disabled.)
# The same scaler object is refit on every series in turn, so after this
# block it only holds the statistics of `commo`.
# NOTE(review): each fit uses every row of the series; if a train/test
# split follows, test-period statistics enter the scaling -- confirm intended.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(pd.DataFrame(X)))
equity = pd.DataFrame(scaler.fit_transform(pd.DataFrame(equity)))
fi = pd.DataFrame(scaler.fit_transform(pd.DataFrame(fi)))
credit = pd.DataFrame(scaler.fit_transform(pd.DataFrame(credit)))
commo = pd.DataFrame(scaler.fit_transform(pd.DataFrame(commo)))
X
# Ordered (unshuffled) 80/20 split. All series are split in a single call,
# so features and every target are cut at the same row.
(
    X_train, X_test,
    y_equity_train, y_equity_test,
    y_fi_train, y_fi_test,
    y_credit_train, y_credit_test,
    y_commo_train, y_commo_test,
) = train_test_split(X, equity, fi, credit, commo, test_size=0.2, shuffle=False)
# One ordinary least-squares model per asset class, all on the same features.
Lin_reg_equity = LinearRegression().fit(X_train, y_equity_train)
Lin_reg_fi = LinearRegression().fit(X_train, y_fi_train)
Lin_reg_credit = LinearRegression().fit(X_train, y_credit_train)
# `reg` ends up bound to the last fitted model.
reg = Lin_reg_commo = LinearRegression().fit(X_train, y_commo_train)

# In-sample score of each model.
for message, model, target in (
    ('training score for equity', Lin_reg_equity, y_equity_train),
    ('training score for fi', Lin_reg_fi, y_fi_train),
    ('training score for credit', Lin_reg_credit, y_credit_train),
    ('training score for commo', Lin_reg_commo, y_commo_train),
):
    print(message, model.score(X_train, target))

print('coefs are', Lin_reg_equity.coef_)
print('intercept is', Lin_reg_equity.intercept_)
# Out-of-sample error of the equity model.
y_equity_pred = Lin_reg_equity.predict(X_test)
equity_rmse = mean_squared_error(y_equity_test, y_equity_pred, squared=False)
print('RMSE is:', equity_rmse)
equity_abs_error = np.abs(y_equity_test - y_equity_pred)
print('mean error is:', np.mean(equity_abs_error))
sns.set(rc={'figure.figsize': (6, 5)})

y_fi_pred = Lin_reg_fi.predict(X_test)
y_credit_pred = Lin_reg_credit.predict(X_test)
y_commo_pred = Lin_reg_commo.predict(X_test)

# Predicted (red) against realised (green) values on the test rows.
for predicted, actual in (
    (y_equity_pred, y_equity_test),
    (y_fi_pred, y_fi_test),
    (y_credit_pred, y_credit_test),
    (y_commo_pred, y_commo_test),
):
    steps = range(len(predicted))
    plt.plot(steps, predicted, c='red')
    plt.plot(steps, actual, c='green')
# Single Lasso fit without cross-validation (alpha fixed at 1).
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score

regLasso1 = Lasso(alpha=1, normalize=True, fit_intercept=True)
regLasso1.fit(X_train, y_equity_train)
for label, features, target in (
    ('score train', X_train, y_equity_train),
    ('score test', X_test, y_equity_test),
):
    print(label, regLasso1.score(features, target))
print('coefs', regLasso1.coef_)
from sklearn.linear_model import lasso_path
import matplotlib.cm as cm

# Grid of candidate penalties shared by the path plot and the later searches.
# NOTE(review): the grid starts at alpha = 0, where the L1 penalty vanishes;
# scikit-learn advises against alpha = 0 for Lasso -- confirm it is wanted.
my_alphas = np.linspace(0, 1, 1000)

alpha_for_path, coefs_lasso, _ = lasso_path(X_train, y_equity_train.stack(), alphas=my_alphas)

# Colour palette: one colour per coefficient row, sized from the data so
# the plot does not depend on there being exactly 16 features.
couleurs = cm.rainbow(np.linspace(0, 1, coefs_lasso.shape[0]))

# Lasso path plot (one curve per variable).
for coef_path, couleur in zip(coefs_lasso, couleurs):
    plt.plot(alpha_for_path, coef_path, c=couleur)
plt.xlabel('Alpha')
plt.ylabel('Coefficients')
plt.title('Lasso path')
plt.show()
# For every asset class, fit a Lasso at each alpha of `my_alphas`, record
# the test RMSE, and keep the alpha (and its coefficients) with the lowest.
# NOTE(review): alpha is selected on the test split itself, so the reported
# RMSE is optimistic -- confirm whether a validation split is wanted.
# NOTE(review): `normalize=` and `squared=` are not accepted by recent
# scikit-learn releases -- confirm the version this is meant to run on.
y_assets_train = [y_equity_train, y_fi_train, y_credit_train, y_commo_train]
y_assets_test = [y_equity_test, y_fi_test, y_credit_test, y_commo_test]
y_names = ['EQUITIES', 'FIXED INCOME', 'CREDIT', 'COMMODITIES']
dict_assets = {}
RMSEs = []
for asset_train, asset_test, name in zip(y_assets_train, y_assets_test, y_names):
    train_r_squared = np.zeros(len(my_alphas))
    test_r_squared = np.zeros(len(my_alphas))
    RMSE = np.zeros(len(my_alphas))
    params = []
    for idx, alpha in enumerate(my_alphas):
        regLasso = Lasso(fit_intercept=True, normalize=True, alpha=alpha)
        regLasso.fit(X_train, asset_train)
        y_pred = regLasso.predict(X_test)
        # The 5-fold cross_val_score on the test rows was dropped: its
        # result was never read and it cost five extra fits per alpha.
        train_r_squared[idx] = regLasso.score(X_train, asset_train)
        test_r_squared[idx] = regLasso.score(X_test, asset_test)
        RMSE[idx] = mean_squared_error(asset_test, y_pred, squared=False)
        params.append(regLasso.coef_)
    RMSEs.append(RMSE)
    best = int(np.argmin(RMSE))
    min_alpha = my_alphas[best]
    dict_assets[name] = {'min alpha': min_alpha, 'beta_list': params[best]}
# alpha/RMSE curve for asset classes: one figure, one row per asset class.
# The figure is created once; creating it inside the loop would put each
# curve alone in its own figure, squeezed into a different row slot.
fig = plt.figure()
for row, (asset_name, rmse_curve) in enumerate(zip(y_names, RMSEs), start=1):
    ax = fig.add_subplot(len(RMSEs), 1, row)
    ax.plot(my_alphas, rmse_curve)
    ax.set_xscale('log')
    ax.set_title(asset_name)
fig.tight_layout()  # keep the stacked titles from overlapping
dict_assets
import plotly.express as px

# One bar chart per asset class showing the coefficients kept for it.
for asset, best_fit in dict_assets.items():
    df_coefs_equity = pd.DataFrame(best_fit['beta_list'])
    fig = px.bar(
        df_coefs_equity,
        x=data_macro_bb.columns,
        y=df_coefs_equity.values,
        color=0,
        title=asset,
    )
    fig.show()
# Cross-validated Lasso for EQUITIES.
lasso_cv = LassoCV(normalize=True, alphas=np.logspace(-10, 1, 400), max_iter=10000)
lasso_model_equity = lasso_cv.fit(X_train, y_equity_train)
lasso_prediction_equity = lasso_model_equity.predict(X_test)

# Score against the realised test targets, not against another model's
# predictions. The variable keeps its historical name but holds an RMSE
# (squared=False), which is what the printed label now says.
lasso_mae_equity = mean_squared_error(y_equity_test, lasso_prediction_equity, squared=False)

# Pair the intercept and every coefficient with its label. All feature
# columns are listed, so no coefficient is dropped by zip().
lasso_coefs_equity = dict(
    zip(['Intercept'] + data_macro_bb.columns.tolist(),
        np.round(np.concatenate((lasso_model_equity.intercept_, lasso_model_equity.coef_), axis=None), 3))
)
print('LASSO RMSE: {}'.format(np.round(lasso_mae_equity, 3)))
print('LASSO coefficients:')
df_lasso_coefs_equity = pd.DataFrame.from_dict([lasso_coefs_equity]).T

import plotly.express as px
fig = px.bar(df_lasso_coefs_equity,
             x=df_lasso_coefs_equity.index,
             y=df_lasso_coefs_equity.values,
             color=0)
fig.show()
# Rolling-window Lasso: five consecutive 30-row windows, each split into
# 24 training rows and 6 test rows. For every window and asset class the
# alpha with the lowest test RMSE is kept together with its coefficients.
# NOTE(review): X is taken from data_macro_bb here while the targets are
# the standardised series; whether the rows line up depends on what
# data_macro_bb holds at this point -- confirm.
X = data_macro_bb
X = X.dropna().reset_index(drop=True)
dyn_coefs = []
y_names = ['EQUITIES', 'FIXED INCOME', 'CREDIT', 'COMMODITIES']
for window in range(5):
    down = window * 30
    mid = down + 24
    up = (window + 1) * 30
    # Slice the features once per window instead of once per alpha.
    X_window_train = X.iloc[down: mid, :]
    X_window_test = X.iloc[mid: up, :]
    y_assets_train = [equity.iloc[down: mid, :], fi.iloc[down: mid, :], credit.iloc[down: mid, :], commo.iloc[down: mid, :]]
    y_assets_test = [equity.iloc[mid: up, :], fi.iloc[mid: up, :], credit.iloc[mid: up, :], commo.iloc[mid: up, :]]
    dict_assets = {}
    RMSEs = []
    for asset_train, asset_test, name in zip(y_assets_train, y_assets_test, y_names):
        train_r_squared = np.zeros(len(my_alphas))
        RMSE = np.zeros(len(my_alphas))
        params = []
        # `idx` is kept separate from the window index so the alpha
        # counter never overwrites the outer loop variable.
        for idx, alpha in enumerate(my_alphas):
            regLasso = Lasso(fit_intercept=True, normalize=True, alpha=alpha)
            regLasso.fit(X_window_train, asset_train)
            y_pred = regLasso.predict(X_window_test)
            train_r_squared[idx] = regLasso.score(X_window_train, asset_train)
            RMSE[idx] = mean_squared_error(asset_test, y_pred, squared=False)
            params.append(regLasso.coef_)
        RMSEs.append(RMSE)
        best = int(np.argmin(RMSE))
        min_alpha = my_alphas[best]
        dict_assets[name] = {'min alpha': min_alpha, 'beta_list': params[best]}
    dyn_coefs.append(dict_assets)
# Regroup the per-window results by asset class: all_coefs[k] lists, window
# by window, the coefficients kept for asset class y_names[k].
all_coefs = [
    [window_result[name]['beta_list'].tolist() for window_result in dyn_coefs]
    for name in y_names
]
# One DataFrame per asset class: rows are windows, columns are features.
list_all_df = [
    pd.DataFrame(asset_coefs, columns=X.columns) for asset_coefs in all_coefs
]
list_all_df
# Coefficient trajectories across windows, one figure per asset class.
# plt.show must be called (with parentheses) to render anything; a fresh
# figure per asset class keeps the four plots from piling onto one axes.
for coef_frame, title in zip(
    list_all_df, ('EQUITIES', 'FIXED INCOME', 'CREDIT', 'COMMODITIES')
):
    plt.figure()
    plt.plot(coef_frame)
    plt.title(title)
    plt.show()
def soft_threshold(rho, lamda):
    """Soft-thresholding operator used by the Lasso coordinate update.

    Returns ``rho + lamda`` when ``rho`` is below ``-lamda``, ``rho - lamda``
    when ``rho`` is above ``lamda``, and ``0`` otherwise.
    """
    lower_bound = -lamda
    if rho < lower_bound:
        return rho + lamda
    if rho > lamda:
        return rho - lamda
    return 0
def coordinate_descent_lasso(theta, X, y, lamda=.01, num_iters=100, intercept=False):
    """Fit Lasso coefficients by cyclic coordinate descent.

    Every column of ``X`` is first divided by its Euclidean norm. Then, for
    ``num_iters`` sweeps, each coefficient is updated in turn from the
    correlation between its column and the residual that leaves that
    column's own contribution out.

    Args:
        theta: Starting coefficients, indexed one row per column of ``X``
            (assumed to be an ``(n, 1)`` array -- TODO confirm). It is
            updated in place, so the caller's array changes as well.
        X: Two-dimensional feature array with one column per coefficient.
        y: Target values (assumed to be an ``(m, 1)`` column -- TODO confirm).
        lamda: Threshold handed to ``soft_threshold``.
        num_iters: Number of full sweeps over the coordinates.
        intercept: When truthy, coordinate 0 is treated as an intercept and
            set without thresholding; all other coordinates are thresholded.
            Any falsy value thresholds every coordinate.

    Returns:
        A flattened copy of the final ``theta``.
    """
    n_features = X.shape[1]
    # Put every column on unit norm in case the caller has not done so.
    X = X / np.linalg.norm(X, axis=0)

    for _ in range(num_iters):
        for j in range(n_features):
            X_j = X[:, j].reshape(-1, 1)
            y_pred = X @ theta
            # Adding theta[j] * X_j back removes coordinate j from the fit.
            rho = X_j.T @ (y - y_pred + theta[j] * X_j)
            # Truthiness test: a flag that is neither True nor False
            # still updates the coefficients instead of skipping them.
            if intercept and j == 0:
                theta[j] = rho
            else:
                theta[j] = soft_threshold(rho, lamda)
    return theta.flatten()
# equity
# Lasso paths for the equity target using the NumPy coordinate-descent
# implementation: one fit per lambda, then one curve per coefficient.
X = np.array(X_train).astype(float)  # builtin float; the np.float alias is gone from current NumPy
y = np.array(y_equity_train).astype(float)
m, n = X.shape
initial_theta = np.ones((n, 1))
theta_list = list()
lamda = np.logspace(0, 4, 300) / 10  # Range of lambda values

# Run lasso regression for each lambda.
# coordinate_descent_lasso writes into the array it is given, so each fit
# starts from the solution found for the previous lambda.
for penalty in lamda:
    theta = coordinate_descent_lasso(initial_theta, X, y, lamda=penalty, num_iters=100)
    theta_list.append(theta)

# Stack into a NumPy array with one row per coefficient.
theta_lasso = np.stack(theta_list).T

# Plot results
n, _ = theta_lasso.shape
plt.figure(figsize=(12, 8))
for i in range(n):
    plt.plot(lamda, theta_lasso[i], label=data_macro_bb.columns[i])
plt.xscale('log')
plt.xlabel('Log($\\lambda$)')
plt.ylabel('Coefficients')
plt.title('Lasso Paths - Numpy implementation')
plt.legend()
plt.axis('tight')
# commodities
# Lasso paths for the commodities target using the NumPy coordinate-descent
# implementation: one fit per lambda, then one curve per coefficient.
X = np.array(X_train).astype(float)  # builtin float; the np.float alias is gone from current NumPy
y = np.array(y_commo_train).astype(float)
m, n = X.shape
initial_theta = np.ones((n, 1))
theta_list = list()
lamda = np.logspace(0, 4, 300) / 10  # Range of lambda values

# Run lasso regression for each lambda.
# coordinate_descent_lasso writes into the array it is given, so each fit
# starts from the solution found for the previous lambda.
for penalty in lamda:
    theta = coordinate_descent_lasso(initial_theta, X, y, lamda=penalty, num_iters=100)
    theta_list.append(theta)

# Stack into a NumPy array with one row per coefficient.
theta_lasso = np.stack(theta_list).T

# Plot results
n, _ = theta_lasso.shape
plt.figure(figsize=(12, 8))
for i in range(n):
    plt.plot(lamda, theta_lasso[i], label=data_macro_bb.columns[i])
plt.xscale('log')
plt.xlabel('Log($\\lambda$)')
plt.ylabel('Coefficients')
plt.title('Lasso Paths - Numpy implementation')
plt.legend()
plt.axis('tight')